# Provides a session log for the packages used in this .rmd
# (written to 03a_session_log.txt; pkgs = "!attached" restricts the log
# to packages that are loaded but not attached)
library(sessioninfo)
session_info(pkgs = "!attached", to_file = "03a_session_log.txt")

1 Introduction

In this script, we go through all the pre-registered proposed analyses. As a reminder, the main research questions were as follows:

  1. To what extent does infants’ preference for IDS as measured in a laboratory setting predict their vocabulary at 18 and 24 months?
  2. Does the relation between IDS preference and vocabulary size change over development?
  3. Are there systematic differences in the strength of this relationship across the language communities in our sample?

Here we present the main “sample theory based” analyses (also known as frequentist), separately on the North American and UK samples in parallel to answer our first two research questions, then together to answer our third research question. In the next section (03b), we provide additional Bayesian statistics where a null effect was found, as specified in the pre-registration.

# Library imports, general settings ==============
library(tidyverse)
library(egg)
library(knitr)
library(lme4)
library(lmerTest)
library(simr)
library(modelsummary) # creates and exports the model tables as Word docs

# As in our discussion with Mike, we will use lmerTest for calculating p values
library(lattice)
library(effects)
library(sjPlot)
library(robustlmm)
library(car)
library(papaja)
theme_apa <- papaja::theme_apa

# Load model comparison functions
source("helper/lrtests.R")

# Deal with package priority issues: make sure dplyr's select() masks the
# select() exported by other attached packages
select <- dplyr::select

theme_set(theme_bw(base_size = 10))
options("future" = TRUE) # use TRUE, not T: T is a reassignable variable
# knitr::opts_chunk$set(cache = TRUE)

print(sessionInfo()) # list R version, platform, and package versions for reproducibility
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] furrr_0.3.1          future.apply_1.11.0  future_1.32.0       
##  [4] bridgesampling_1.1-2 rstan_2.21.8         StanHeaders_2.26.26 
##  [7] brms_2.19.0          Rcpp_1.0.10          papaja_0.1.1        
## [10] tinylabels_0.2.3     car_3.1-2            robustlmm_3.2-0     
## [13] sjPlot_2.8.14        effects_4.2-2        carData_3.0-5       
## [16] lattice_0.21-8       modelsummary_1.4.1   simr_1.0.7          
## [19] lmerTest_3.1-3       lme4_1.1-33          Matrix_1.5-4        
## [22] knitr_1.42           egg_0.4.5            gridExtra_2.3       
## [25] lubridate_1.9.2      forcats_1.0.0        stringr_1.5.0       
## [28] dplyr_1.1.2          purrr_1.0.1          readr_2.1.4         
## [31] tidyr_1.3.0          tibble_3.2.1         ggplot2_3.4.2       
## [34] tidyverse_2.0.0      sessioninfo_1.2.2   
## 
## loaded via a namespace (and not attached):
##   [1] shinythemes_1.2.0       splines_4.3.0           later_1.3.1            
##   [4] datawizard_0.7.1        xts_0.13.1              lifecycle_1.0.3        
##   [7] globals_0.16.2          processx_3.8.1          MASS_7.3-58.4          
##  [10] crosstalk_1.2.0         insight_0.19.2          backports_1.4.1        
##  [13] survey_4.2-1            magrittr_2.0.3          sass_0.4.6             
##  [16] rmarkdown_2.21          jquerylib_0.1.4         yaml_2.3.7             
##  [19] plotrix_3.8-2           httpuv_1.6.11           zip_2.3.0              
##  [22] askpass_1.1             pkgbuild_1.4.1          DBI_1.1.3              
##  [25] minqa_1.2.5             abind_1.4-5             rvest_1.0.3            
##  [28] nnet_7.3-18             tensorA_0.36.2          gdtools_0.3.3          
##  [31] inline_0.3.19           pbkrtest_0.5.2          listenv_0.9.0          
##  [34] crul_1.4.0              performance_0.10.3      parallelly_1.36.0      
##  [37] svglite_2.1.1           codetools_0.2-19        DT_0.28                
##  [40] xml2_1.3.4              tidyselect_1.2.0        bayesplot_1.10.0       
##  [43] ggeffects_1.2.3         httpcode_0.3.0          farver_2.1.1           
##  [46] effectsize_0.8.3        stats4_4.3.0            base64enc_0.1-3        
##  [49] matrixStats_1.0.0       webshot_0.5.4           jsonlite_1.8.4         
##  [52] ellipsis_0.3.2          survival_3.5-5          iterators_1.0.14       
##  [55] emmeans_1.8.6           systemfonts_1.0.4       tools_4.3.0            
##  [58] ragg_1.2.5              binom_1.1-1.1           glue_1.6.2             
##  [61] xfun_0.39               mgcv_1.8-42             distributional_0.3.2   
##  [64] loo_2.6.0               withr_2.5.0             numDeriv_2016.8-1.1    
##  [67] fastmap_1.1.1           mitools_2.4             boot_1.3-28.1          
##  [70] fansi_1.0.4             shinyjs_2.1.0           openssl_2.0.6          
##  [73] callr_3.7.3             digest_0.6.31           timechange_0.2.0       
##  [76] R6_2.5.1                mime_0.12               estimability_1.4.1     
##  [79] textshaping_0.3.6       colorspace_2.1-0        gtools_3.9.4           
##  [82] markdown_1.7            threejs_0.3.3           utf8_1.2.3             
##  [85] generics_0.1.3          fontLiberation_0.1.0    data.table_1.14.8      
##  [88] robustbase_0.95-1       prettyunits_1.1.1       httr_1.4.6             
##  [91] htmlwidgets_1.6.2       parameters_0.21.1       pkgconfig_2.0.3        
##  [94] dygraphs_1.1.1.6        gtable_0.3.3            htmltools_0.5.5        
##  [97] fontBitstreamVera_0.1.1 scales_1.2.1            kableExtra_1.3.4       
## [100] posterior_1.4.1         rstudioapi_0.14         reshape2_1.4.4         
## [103] tzdb_0.4.0              uuid_1.1-0              coda_0.19-4            
## [106] checkmate_2.2.0         nlme_3.1-162            curl_5.0.0             
## [109] nloptr_2.0.3            zoo_1.8-12              cachem_1.0.8           
## [112] flextable_0.9.1         sjlabelled_1.2.0        RLRsim_3.1-8           
## [115] miniUI_0.1.1.1          parallel_4.3.0          pillar_1.9.0           
## [118] grid_4.3.0              vctrs_0.6.2             shinystan_2.6.0        
## [121] promises_1.2.0.1        xtable_1.8-4            evaluate_0.21          
## [124] fastGHQuad_1.0.1        mvtnorm_1.1-3           cli_3.6.1              
## [127] compiler_4.3.0          rlang_1.1.1             crayon_1.5.2           
## [130] rstantools_2.3.1        modelr_0.1.11           ps_1.7.5               
## [133] plyr_1.8.8              sjmisc_2.8.9            stringi_1.7.12         
## [136] viridisLite_0.4.2       tables_0.9.17           munsell_0.5.0          
## [139] colourpicker_1.2.0      Brobdingnag_1.2-9       bayestestR_0.13.1      
## [142] fontquiver_0.2.1        sjstats_0.18.2          hms_1.1.3              
## [145] gfonts_0.2.0            shiny_1.7.4             igraph_1.4.3           
## [148] broom_1.0.4             RcppParallel_5.1.7      bslib_0.4.2            
## [151] DEoptimR_1.0-14         officer_0.6.2
# Read data ======================================
# Explicit column types for the processed dataset so read_csv() does not
# have to guess: factors for IDs/design variables, integers for counts and
# ages in days, numeric for scores, logicals for flags.
col_types <- cols(
  labid = col_factor(),
  subid = col_factor(),
  subid_unique = col_factor(),
  CDI.form = col_factor(),
  CDI.nwords = col_integer(),
  CDI.prop = col_number(),
  CDI.agerange = col_factor(),
  CDI.agedays = col_integer(),
  CDI.agemin = col_integer(),
  CDI.agemax = col_integer(),
  vocab_nwords = col_integer(),
  standardized.score.CDI = col_character(),
  standardized.score.CDI.num = col_number(),
  z.IDS_pref = col_number(),
  language = col_factor(),
  language_zone = col_factor(),
  CDI.error = col_logical(),
  Notes = col_character(),
  trial_order = col_factor(),
  method = col_factor(),
  age_days = col_integer(),
  age_mo = col_number(),
  age_group = col_factor(),
  nae = col_logical(),
  gender = col_factor(),
  second_session = col_logical()
)
# Processed dataset produced by script 02b
data.total <- read_csv("data/02b_processed.csv", col_types = col_types)

Before moving on with the analysis, we have to ready the data by (a) checking for collinearity between z_age_months and CDI.z_age_months and correcting this if necessary, and (b) setting up the contrasts described in our data analysis.

1.1 Collinearity check

First, we run a Kappa test to check for possible collinearity between z_age_months and CDI.z_age_months.

# Run kappa test: condition number of the model matrix for the two age
# predictors (values > 10 would indicate problematic collinearity)
k.age_months <- model.matrix(~ z_age_months + CDI.z_age_months, data = data.total) %>%
  kappa(exact = TRUE) # use TRUE, not T: T is a reassignable variable

With a value of 3.2587054, we do not have a collinearity issue and can proceed with the data analysis as planned (the criterion indicating collinearity is kappa > 10).

1.2 Contrast Setups

We need gender as an effect-coded factor, and method as a deviation-coded factor. This is achieved in R by using the contr.sum() function with the number of levels for each factor. Notably, when subsetting the UK sample, only two levels of method out of the three in total were left.

# Set contrasts on the total dataset =============
# Helper: apply sum-to-zero (effect/deviation) coding to the gender and
# method factors. `method_levels` is 3 when all three methods are present
# and 2 for the UK subsets, where only two of the three methods remain.
set_sum_contrasts <- function(df, method_levels) {
  contrasts(df$gender) <- contr.sum(2)
  contrasts(df$method) <- contr.sum(method_levels)
  df
}

data.total <- set_sum_contrasts(data.total, 3)

# Create sub-datasets, with contrasts ============
## NAE
data.nae <- data.total %>%
  subset(language_zone == "NAE") %>%
  droplevels() %>%
  set_sum_contrasts(3)

## UK (combined-age and separate 18/24 months)
# Note: after droplevels() the UK sample has only 2 method levels, so the
# sum-to-zero contrasts are set with 2 levels.
data.uk <- data.total %>%
  subset(language_zone == "British") %>%
  droplevels() %>%
  set_sum_contrasts(2)

data.uk.18 <- data.total %>%
  subset(language_zone == "British" & CDI.agerange == "18") %>%
  droplevels() %>%
  set_sum_contrasts(2)

data.uk.24 <- data.total %>%
  subset(language_zone == "British" & CDI.agerange == "24") %>%
  droplevels() %>%
  set_sum_contrasts(2)

## Other
data.other <- data.total %>%
  subset(language_zone == "Other") %>%
  droplevels() %>%
  set_sum_contrasts(3)

2 Descriptive Statistics

We first assess the amount of data we have overall per condition and their shape overall.

# Sample size and age (mean/SD of CDI.agedays) per design cell
cell_summary <- data.total %>%
  group_by(language_zone, CDI.agerange, method, gender) %>%
  summarise(N = n(), age = mean(CDI.agedays), sd = sd(CDI.agedays))
kable(cell_summary)
## `summarise()` has grouped output by 'language_zone', 'CDI.agerange', 'method'.
## You can override using the `.groups` argument.
language_zone CDI.agerange method gender N age sd
British 18 singlescreen F 21 548.7619 9.065896
British 18 singlescreen M 18 548.2778 9.448346
British 18 eyetracking F 8 546.6250 7.558108
British 18 eyetracking M 10 549.5000 8.934950
British 24 singlescreen F 12 734.6667 9.345230
British 24 singlescreen M 6 733.8333 4.400758
British 24 eyetracking F 12 729.2500 9.333469
British 24 eyetracking M 5 737.8000 8.228001
Other 18 singlescreen F 10 542.6000 6.586181
Other 18 singlescreen M 12 545.4167 5.567084
Other 18 eyetracking F 18 548.1667 6.862087
Other 18 eyetracking M 21 551.7143 6.341474
Other 18 hpp F 32 545.2500 6.530326
Other 18 hpp M 27 550.5556 7.890273
Other 24 singlescreen F 8 729.0000 7.131419
Other 24 singlescreen M 10 726.7000 4.372896
Other 24 eyetracking F 26 730.5385 5.623030
Other 24 eyetracking M 23 730.1304 5.504580
Other 24 hpp F 31 729.0645 9.312842
Other 24 hpp M 25 727.4800 7.665507
NAE 18 singlescreen F 19 554.9474 20.780530
NAE 18 singlescreen M 14 581.2143 24.925052
NAE 18 eyetracking F 1 549.0000 NA
NAE 18 hpp F 64 557.9375 22.801333
NAE 18 hpp M 66 556.1667 22.762035
NAE 24 singlescreen F 23 737.1739 26.604258
NAE 24 singlescreen M 20 739.6000 21.352678
NAE 24 eyetracking F 2 756.5000 34.648232
NAE 24 eyetracking M 1 745.0000 NA
NAE 24 hpp F 58 731.6897 28.149480
NAE 24 hpp M 65 726.2769 27.133068

Total number of children by age group

# Number of CDI administrations per age group
data.total %>%
  group_by(CDI.agerange) %>%
  summarise(N = n()) %>%
  kable()
CDI.agerange N
18 341
24 327

Total number of CDIs (not unique children)

# Total number of CDI administrations (rows, not unique children)
data.total %>%
  summarise(N = n()) %>%
  kable()
N
668

Total number of CDIs (not unique children) by language zone

# CDI administrations per language zone
data.total %>%
  group_by(language_zone) %>%
  summarise(N = n()) %>%
  kable()
language_zone N
British 92
Other 243
NAE 333

Total number of unique children

# Unique children overall (distinct subid_unique)
data.total %>%
  distinct(subid_unique) %>%
  summarise(N = n()) %>%
  kable()
N
467

Total number of unique children per language zone

# Unique children per language zone
data.total %>%
  group_by(language_zone) %>%
  distinct(subid_unique) %>%
  summarise(N = n()) %>%
  kable()
language_zone N
British 76
Other 163
NAE 228

Total number of children per gender and age

# Children per gender and age group
data.total %>%
  group_by(gender,CDI.agerange) %>%
  summarise(N = n()) %>%
  kable()
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.
gender CDI.agerange N
F 18 173
F 24 172
M 18 168
M 24 155

Total number of children per gender and age

# Children per gender
data.total %>%
  group_by(gender) %>%
  summarise(N = n()) %>%
  kable()
gender N
F 345
M 323

More detailed information about Descriptive Statistics

# Number of labs per language zone (distinct labid)
data.total %>%
  select(labid, language_zone) %>%
  unique() %>%
  group_by(language_zone) %>%
  count()
## # A tibble: 3 × 2
## # Groups:   language_zone [3]
##   language_zone     n
##   <fct>         <int>
## 1 British           4
## 2 Other             8
## 3 NAE               9
# CDI administrations per language zone and age group
data.total %>%
  group_by(language_zone, CDI.agerange) %>%
  summarize(N = n())
## `summarise()` has grouped output by 'language_zone'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 3
## # Groups:   language_zone [3]
##   language_zone CDI.agerange     N
##   <fct>         <fct>        <int>
## 1 British       18              57
## 2 British       24              35
## 3 Other         18             120
## 4 Other         24             123
## 5 NAE           18             164
## 6 NAE           24             169
# age range in each age group and language zone
# (CDI.agedays is converted from days to months: days / 365.25 * 12)
data.total %>%
  select(subid, language_zone, CDI.agedays, CDI.agerange) %>%
  unique() %>%
  group_by(language_zone, CDI.agerange) %>%
  summarize(
    age_min = (min(CDI.agedays) / 365.25 * 12),
    age_max = (max(CDI.agedays) / 365.25 * 12)
  )
## `summarise()` has grouped output by 'language_zone'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 4
## # Groups:   language_zone [3]
##   language_zone CDI.agerange age_min age_max
##   <fct>         <fct>          <dbl>   <dbl>
## 1 British       18              17.6    18.5
## 2 British       24              23.5    24.5
## 3 Other         18              17.5    18.5
## 4 Other         24              23.5    24.5
## 5 NAE           18              16.1    20.0
## 6 NAE           24              22.1    25.9

We then assess the data per lab in terms of sample size and CDI score (vocabulary size, for consistency between language zones).

# Per-lab summaries: sample size plus mean vocabulary size with 95% CI,
# where the CI comes from an intercept-only lm() fit within each
# lab x language-zone x age-group cell
by_lab <- data.total %>%
  group_by(labid, language_zone, CDI.agerange) %>%
  mutate(tested = n_distinct(subid_unique)) %>%
  select(labid, language_zone, CDI.agerange, tested, vocab_nwords) %>%
  nest(scores = vocab_nwords) %>%
  mutate(
    model = map(scores, ~ lm(vocab_nwords ~ 1, data = .x)),
    ci = map(model, confint)
  ) %>%
  transmute(
    tested = tested,
    mean = map_dbl(model, ~ coefficients(.x)[[1]]), # intercept = cell mean
    ci_lower = map_dbl(ci, 1),
    ci_upper = map_dbl(ci, 2)
  ) %>%
  arrange(language_zone) %>%
  rownames_to_column() %>%
  ungroup()

2.1 Visualization by Lab (Mean and confidence intervals)

# Create a character copy of the lab IDs so they can be sorted alphabetically
by_lab$labid_car <- as.character(by_lab$labid)

# Relabel the CDI age factor levels for the plot facet titles
levels(by_lab$CDI.agerange) <- c("18-Month Olds", "24-Month Olds")

# Order the language-zone factor levels as they should appear in the graph
by_lab$language_zone <- factor(by_lab$language_zone, levels = c("Other", "NAE", "British"))

# Sort the lab IDs into the order they should appear in the ggplot
labid_ord <- by_lab %>%
  dplyr::arrange(language_zone, desc(labid_car)) %>%
  ungroup() %>%
  filter(CDI.agerange == "18-Month Olds") %>%
  select(labid)

# Re-level the lab ID factor to match that display order
by_lab$labid <- factor(by_lab$labid, levels = labid_ord$labid)

## graph by language zone and lab Id in asc order
# Forest-style plot: per-lab mean vocabulary size with 95% CI, point size
# proportional to sample size, faceted by age group
lab_plot <- ggplot(
  by_lab,
  aes(
    x = labid,
    y = mean,
    colour = language_zone,
    ymin = ci_lower,
    ymax = ci_upper
  )
) +
  geom_linerange() +
  geom_point(aes(size = tested)) +
  coord_flip(ylim = c(0, 500)) +
  xlab("Laboratory ID") +
  ylab("Average Vocabulary Size") +
  scale_colour_brewer(
    palette = "Dark2", name = "Language\nZone",
    breaks = c("British", "NAE", "Other")
  ) +
  scale_size_continuous(name = "Sample\nSize") +
  facet_wrap(vars(CDI.agerange)) +
  theme(text = element_text(size = 20))

lab_plot

ggsave("plots/vocab-size_by-lab.png", width = 12, height = 8)

3 Sample Theory Based Statistics

3.1 Simple Correlation

First, we want to assess quickly if there is a direct correlation between IDS preference and CDI score, computing a Pearson’s product-moment correlation. We use standardized CDI scores for the North American sample, and raw scores for the British sample. Since CDI grows with age, we run the British sample separately for 18 and 24 months.

# Statistics =====================================
## North American Sample
# Pearson correlation between IDS preference and standardized CDI score
test.pearson.nae <- cor.test(data.nae$z.IDS_pref,
  data.nae$z_standardized_CDI,
  alternative = "two.sided", method = "pearson"
)

test.pearson.nae
## 
##  Pearson's product-moment correlation
## 
## data:  data.nae$z.IDS_pref and data.nae$z_standardized_CDI
## t = -0.91847, df = 305, p-value = 0.3591
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.16349833  0.05977293
## sample estimates:
##         cor 
## -0.05251901
## UK Sample
# 18-month-olds: IDS preference vs. (standardized) raw vocabulary size
test.pearson.uk.18 <- cor.test(data.uk.18$z.IDS_pref,
  data.uk.18$z_vocab_nwords,
  alternative = "two.sided", method = "pearson"
)

test.pearson.uk.18
## 
##  Pearson's product-moment correlation
## 
## data:  data.uk.18$z.IDS_pref and data.uk.18$z_vocab_nwords
## t = -0.026578, df = 55, p-value = 0.9789
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2639050  0.2572241
## sample estimates:
##          cor 
## -0.003583759
# 24-month-olds: same correlation on the older UK age group
test.pearson.uk.24 <- cor.test(data.uk.24$z.IDS_pref,
  data.uk.24$z_vocab_nwords,
  alternative = "two.sided", method = "pearson"
)

test.pearson.uk.24
## 
##  Pearson's product-moment correlation
## 
## data:  data.uk.24$z.IDS_pref and data.uk.24$z_vocab_nwords
## t = 0.45131, df = 33, p-value = 0.6547
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2617574  0.4010989
## sample estimates:
##        cor 
## 0.07832108

3.1.1 Plots for correlation

## North American Sample
### Get correlation value for annotation
# Plotmath label template, parsed by annotate(parse = TRUE)
cor_text <- "paste(italic(R)^2, \" =\")"
# NOTE(review): `cor_value` is the correlation estimate r, but the label
# template renders R^2 — confirm which statistic is intended
cor_value <- round(test.pearson.nae$estimate, 3)


### Build plot

# x-axis range, used to right-align the annotation text
xrng <- range(data.nae$IDS_pref)

plot.pearson.nae <- data.nae %>%
  ggplot(aes(
    x = IDS_pref,
    y = standardized.score.CDI.num
  )) +
  xlab("Standardized IDS Preference") +
  ylab("Standardized CDI Score") +
  labs(title="NAE") +
  geom_point(colour = "#D95F02") +
  geom_smooth(method = lm, color = "#D95F02") +
  annotate("text", x = xrng[2], y = 100,
    hjust = 1, vjust = 0, parse = TRUE, size = 3, # TRUE, not reassignable T
    label = paste(cor_text, cor_value, sep = "~")
  ) + theme_apa()

## UK Sample
cor_value_18 <- round(test.pearson.uk.18$estimate, 3)
cor_value_24 <- round(test.pearson.uk.24$estimate, 3)

# Scatter of IDS preference vs. raw vocabulary size, coloured by age group,
# with the 24-month annotation near the top and the 18-month one near the bottom
plot.pearson.uk <- data.uk %>%
  ggplot(aes(
    x = IDS_pref,
    y = vocab_nwords,
    colour = CDI.agerange
  )) +
  xlab("Standardized IDS Preference") +
  ylab("Vocabulary Size (Total Words)") +
  labs(title="UK") +
  labs(colour = "Age \nGroup") +
  geom_point() +
  geom_smooth(method = lm ) +
    scale_color_manual(values = c("#D81159","#006BA6")) +
  annotate("text", x = xrng[2], y = 415,
    hjust = 1, vjust = 0, parse = TRUE, size = 3, # TRUE, not reassignable T
    label = paste(cor_text, cor_value_24, sep = "~")) +
  annotate("text", x = xrng[2], y = 1,
    hjust = 1, vjust = 0, parse = TRUE, size = 3,
    label = paste(cor_text, cor_value_18, sep = "~")) + theme_apa() + ylim(0, 420)
  


# Global plot: NAE and UK panels side by side
plot.pearson <- ggarrange(plot.pearson.nae, plot.pearson.uk, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 26 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 26 rows containing missing values (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'

plot.pearson

# Export the combined correlation figure
ggsave("plots/corr_plot.png", plot.pearson,
  units = "mm", width = 180, height = 100, dpi = 1000
)

We see no obvious direct link between IDS preference and CDI score here. However, an effect might appear once we take into account various factors that might interact with IDS preference and/or CDI score. We can also first enhance these plots with information about the age group at which infants were tested (18- or 24-month-old) for the NAE sample, using vocabulary size to better compare the NAE and UK samples.

3.2 Mixed-Effects Model by Language Zone

Here, we run a mixed-effects model including only theoretically motivated effects, as described in the pre-registration. We start with the full model below, simplifying the random effects structure until it converges.

3.2.1 NAE full model

# Run models =====================================
## NAE

# Full pre-registered model: standardized CDI score predicted by CDI age,
# gender, test age, method, IDS preference, and the pre-registered
# interactions, with random intercepts for lab and child.
lmer.full.nae <- lmer(
  standardized.score.CDI.num ~ CDI.z_age_months + gender + z_age_months +
    method + z.IDS_pref +
    z.IDS_pref:method + z.IDS_pref:CDI.z_age_months +
    z.IDS_pref:z_age_months +
    (1 | labid) + (1 | subid_unique),
  data = data.nae
)

summary(lmer.full.nae)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## standardized.score.CDI.num ~ CDI.z_age_months + gender + z_age_months +  
##     method + z.IDS_pref + z.IDS_pref:method + z.IDS_pref:CDI.z_age_months +  
##     z.IDS_pref:z_age_months + (1 | labid) + (1 | subid_unique)
##    Data: data.nae
## 
## REML criterion at convergence: 2795.2
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.19986 -0.50964 -0.00062  0.40872  2.40426 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  subid_unique (Intercept) 554.4    23.55   
##  labid        (Intercept)  34.8     5.90   
##  Residual                 219.1    14.80   
## Number of obs: 307, groups:  subid_unique, 211; labid, 8
## 
## Fixed effects:
##                             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)                  31.2694     6.4714  54.2698   4.832 1.15e-05 ***
## CDI.z_age_months              1.1569     0.3412 118.4061   3.390 0.000949 ***
## gender1                      -1.4068     1.8876 194.1693  -0.745 0.456997    
## z_age_months                 -0.1038     0.6287  70.3249  -0.165 0.869311    
## method1                       6.5371     6.5801 128.2567   0.993 0.322356    
## method2                     -15.0001    11.9274 179.3288  -1.258 0.210165    
## z.IDS_pref                  -33.9167    24.7927 208.6284  -1.368 0.172780    
## method1:z.IDS_pref           26.2532    25.4357 208.7560   1.032 0.303201    
## method2:z.IDS_pref          -56.6834    49.5010 207.8344  -1.145 0.253486    
## CDI.z_age_months:z.IDS_pref   0.5553     1.0468 120.4990   0.530 0.596748    
## z_age_months:z.IDS_pref       2.1822     2.0830 190.6377   1.048 0.296136    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) CDI.z__ gendr1 z_g_mn methd1 methd2 z.IDS_ m1:.ID m2:.ID
## CDI.z_g_mnt -0.058                                                         
## gender1     -0.029  0.021                                                  
## z_age_mnths  0.035 -0.036  -0.018                                          
## method1     -0.725 -0.001   0.021  0.102                                   
## method2      0.884 -0.028  -0.033  0.020 -0.882                            
## z.IDS_pref   0.363  0.005   0.038 -0.040 -0.352  0.390                     
## mthd1:.IDS_ -0.340 -0.027  -0.027  0.017  0.366 -0.389 -0.926              
## mthd2:.IDS_  0.355  0.020   0.061 -0.007 -0.366  0.396  0.965 -0.971       
## CDI.__:.IDS -0.011  0.019  -0.019 -0.019 -0.027  0.007 -0.048  0.026 -0.045
## z_g_m:.IDS_ -0.041  0.025   0.102  0.052 -0.037  0.023  0.022 -0.087  0.132
##             CDI.__:
## CDI.z_g_mnt        
## gender1            
## z_age_mnths        
## method1            
## method2            
## z.IDS_pref         
## mthd1:.IDS_        
## mthd2:.IDS_        
## CDI.__:.IDS        
## z_g_m:.IDS_ -0.068
# Robust variant of the full model, kept for reference: it was fit to check
# whether some statistical assumptions could be met better, but we kept the
# original model because the inferential results were consistent.
# robust_lmer.full.nae <- robustlmm::rlmer(standardized.score.CDI.num ~ CDI.z_age_months + gender +
#                         z_age_months + method + z.IDS_pref +
#                         z.IDS_pref:method + z.IDS_pref:CDI.z_age_months + z.IDS_pref:z_age_months +
#                         (1 | labid),
#                       data = data.nae)
#
#
# summary(robust_lmer.full.nae)

full.nae_pvalue <- anova(lmer.full.nae) %>%
  as_tibble(rownames = "Parameter") # this gives us the Type III p values

# ==========
# Sequentially removed random effects:
# z.IDS_pref:z_age_months
# z.IDS_pref:CDI.z_age_months
# z.IDS_pref:method
# z.IDS_pref
# method
# z_age_months
# gender
# CDI.z_age_months
# ==========

3.2.1.1 (Optional) Checking mixed-model assumptions. We will check the following:

  1. Linearity
  2. Normality of the residuals
  3. Homoscedasticity of residuals
  4. No autocorrelation
  5. No multicollinearity
# First, check linearity
# data.nae$resid <- residuals(lmer.full.nae)
#
# plot(data.nae$resid, data.nae$standardized.score.CDI)

# Second, check normality via sjPlot diagnostic panels (residual QQ,
# random-effect QQ, homoscedasticity)
plot_model(lmer.full.nae, type = "diag") ## we do have right-skewed normality of residuals
## Warning in checkDepPackageVersion(dep_pkg = "TMB"): Package version inconsistency detected.
## glmmTMB was built with TMB version 1.9.3
## Current TMB version is 1.9.4
## Please re-install glmmTMB from source or restore original 'TMB' package (see '?reinstalling' for more information)
## [[1]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[2]]$labid
## `geom_smooth()` using formula = 'y ~ x'

## 
## 
## [[3]]

## 
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'

# Third, check autocorrelation: refit with nlme::lme() so nlme::ACF() can be
# applied to the normalized residuals
re_run_lme.full.nae <- nlme::lme(standardized.score.CDI.num ~ CDI.z_age_months + gender +
  z_age_months + method + z.IDS_pref +
  z.IDS_pref:method + z.IDS_pref:CDI.z_age_months + z.IDS_pref:z_age_months,
random = ~ 1 | labid,
method = "REML",
data = data.nae, na.action = na.exclude
)

plot(nlme::ACF(re_run_lme.full.nae, resType = "normalized")) # there is no sign of autocorrelation

# Lastly, check multi-collinearity
car::vif(lmer.full.nae) # We see multicollinearity for the IDS preference variable even though the IDS preference score is centered. It is probably related to the number of participating labs (the group level we control for) and to how the interactions between IDS preference and other variables (which lack variability in the current sample) were entered. We keep IDS preference in the model because the relationship between IDS preference and CDI score is the key research question of the paper.
##                                  GVIF Df GVIF^(1/(2*Df))
## CDI.z_age_months             1.008839  1        1.004410
## gender                       1.038941  1        1.019285
## z_age_months                 1.098900  1        1.048284
## method                       1.298443  2        1.067470
## z.IDS_pref                  19.218063  1        4.383841
## method:z.IDS_pref           20.784955  2        2.135194
## CDI.z_age_months:z.IDS_pref  1.014299  1        1.007124
## z_age_months:z.IDS_pref      1.316157  1        1.147239

3.2.1.2 Table Summary

# Export the full-model summary table as a Word document
modelsummary_lmer_full_nae<- list(
  "Full Model" = lmer.full.nae)

# NOTE(review): `metrics` is not a documented msummary() argument
# (goodness-of-fit statistics are controlled via `gof_map`) — confirm it
# actually affects the exported table
msummary(modelsummary_lmer_full_nae, output = "tables/lmer_full_nae.docx",
             stars = TRUE, align = "lc",metrics=c("RMSE","R2","AIC", "BIC","Log.Lik.","F"))

3.2.2 UK full model

# UK full model: raw vocabulary size as outcome. The lab random intercept
# is commented out because its variance was estimated as zero (see the note
# after the output below).
lmer.full.uk <- lmer(vocab_nwords ~ CDI.z_age_months + gender +
  z_age_months + method + z.IDS_pref +
  z.IDS_pref:method + z.IDS_pref:CDI.z_age_months + z.IDS_pref:z_age_months +
  # (1 | labid) +
  (1 | subid_unique),
data = data.uk
)

summary(lmer.full.uk)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: vocab_nwords ~ CDI.z_age_months + gender + z_age_months + method +  
##     z.IDS_pref + z.IDS_pref:method + z.IDS_pref:CDI.z_age_months +  
##     z.IDS_pref:z_age_months + (1 | subid_unique)
##    Data: data.uk
## 
## REML criterion at convergence: 1001.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.61591 -0.38896 -0.02037  0.37307  1.93394 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  subid_unique (Intercept) 4834     69.53   
##  Residual                 2209     47.00   
## Number of obs: 92, groups:  subid_unique, 76
## 
## Fixed effects:
##                             Estimate Std. Error      df t value Pr(>|t|)    
## (Intercept)                  182.366     10.307  68.349  17.693  < 2e-16 ***
## CDI.z_age_months              33.925      2.345  28.970  14.467 8.68e-15 ***
## gender1                       10.142     10.156  67.389   0.999   0.3215    
## z_age_months                  -8.461      3.340  70.490  -2.533   0.0135 *  
## method1                      -12.223     12.511  66.201  -0.977   0.3321    
## z.IDS_pref                    25.907     27.203  72.778   0.952   0.3441    
## method1:z.IDS_pref           -25.499     29.465  72.479  -0.865   0.3897    
## CDI.z_age_months:z.IDS_pref   10.598      7.059  34.815   1.501   0.1423    
## z_age_months:z.IDS_pref       -8.352      8.978  78.184  -0.930   0.3551    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) CDI.z__ gendr1 z_g_mn methd1 z.IDS_ m1:.ID CDI.__:
## CDI.z_g_mnt  0.172                                                   
## gender1     -0.052 -0.086                                            
## z_age_mnths -0.037 -0.200  -0.125                                    
## method1     -0.225  0.006  -0.154  0.567                             
## z.IDS_pref   0.034  0.042  -0.068 -0.028  0.157                      
## mthd1:.IDS_  0.136  0.053   0.056  0.011  0.026 -0.104               
## CDI.__:.IDS -0.010  0.142   0.019 -0.022  0.087  0.233 -0.138        
## z_g_m:.IDS_ -0.078 -0.039  -0.214 -0.022  0.005 -0.187  0.425 -0.345
full.uk_pvalue <- anova(lmer.full.uk) %>%
  as_tibble(rownames = "Parameter") # this gives us the Type III p values

# ==========
# Sequentially removed random effects:
# z.IDS_pref:z_age_months
# z.IDS_pref:CDI.z_age_months
# z.IDS_pref:method
# z.IDS_pref
# method
# z_age_months
# gender
# CDI.z_age_months
# (1 | labid) — note: the estimated variance of this random effect was essentially zero in the results, so we needed to remove it

3.2.2.1 (Optional) Checking mixed-model assumptions. We will check the following:

  1. Linearity
  2. Normality of the residuals
  3. Homoscedasticity of residuals
  4. No autocorrelation
  5. No multicollinearity
# First, check linearity. The plot looks linear.
# NOTE(review): residuals are plotted against the observed response
# (vocab_nwords) rather than the fitted values; residuals correlate with the
# response by construction, so a residuals-vs-fitted() plot would be the more
# standard linearity check -- confirm this is intended.
data.uk$resid <- residuals(lmer.full.uk)

plot(data.uk$resid, data.uk$vocab_nwords)

# Second, check normality of the residuals via sjPlot diagnostic plots
plot_model(lmer.full.uk, type = "diag") ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula = 'y ~ x'

## 
## 
## [[3]]

## 
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'

# Third, check autocorrelation.
# nlme::ACF() requires an nlme::lme fit, so we re-fit the same fixed-effect
# structure as lmer.full.uk with a lab random intercept.
# FIX: the original passed data = data.nae here, but this is the UK model
# (response vocab_nwords, mirroring lmer.full.uk fit on data.uk); the
# parallel combined-sample check below correctly uses data.uk_nae.
re_run_lme.full.uk <- nlme::lme(vocab_nwords ~ CDI.z_age_months + gender +
  z_age_months + method + z.IDS_pref +
  z.IDS_pref:method + z.IDS_pref:CDI.z_age_months + z.IDS_pref:z_age_months,
random = ~ 1 | labid,
method = "REML",
data = data.uk, na.action = na.exclude
)

plot(nlme::ACF(re_run_lme.full.uk, resType = "normalized")) # there is no sign for autocorrelation

# Lastly, check multicollinearity via variance inflation factors
car::vif(lmer.full.uk) # all VIFs < 2 (see output below), so no multicollinearity concern
##            CDI.z_age_months                      gender 
##                    1.107426                    1.141095 
##                z_age_months                      method 
##                    1.628318                    1.598069 
##                  z.IDS_pref           method:z.IDS_pref 
##                    1.127320                    1.273962 
## CDI.z_age_months:z.IDS_pref     z_age_months:z.IDS_pref 
##                    1.204203                    1.507036

We now want to check the statistical power of significant effects, and discard any models with significant effects that do not reach 80% power. However, the model refits inherent to the simr power simulations produce too many singularity warnings, so we cannot obtain satisfactory power estimates as pre-registered.

AST: Note that we don’t have any IV(s) that turned out to be significant in the Full NAE model, so we won’t run the power analysis check. For the UK full model, there are two statistically significant IVs: CDI_age and gender. The post hoc power check suggested that we have high power for detecting the effect of CDI_age but not gender. Note that gender has a smaller effect size to begin with, which may partially explain why we have less power to detect it in the model. As a number of different factors determine post hoc power, we decided not to remove gender from the model based on the post hoc power analysis check.

# Post hoc power for CDI.z_age_months in the full UK model (1000 simulations)
check_pwr_uk_cdi_age <- simr::powerSim(lmer.full.uk, test = fixed("CDI.z_age_months", method = "z"), seed = 2, nsim = 1000, alpha = 0.05) # CDI.z_age_months is the fixed effect under test (original comment said "Gender" -- a copy-paste slip)

check_pwr_uk_cdi_age

# Post hoc power for gender in the full UK model (1000 simulations)
check_pwr_uk_gender <- simr::powerSim(lmer.full.uk, test = fixed("gender", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) # specify that gender is the fixed effect that we are looking into

check_pwr_uk_gender

3.2.2.2 Table Summary

# Export the full UK model as a Word table via modelsummary
modelsummary_lmer_full_uk<- list(
  "Full Model" = lmer.full.uk)

# NOTE(review): `metrics` is not a documented modelsummary/msummary argument
# (goodness-of-fit rows are selected with `gof_map`); verify the exported
# table actually restricts the fit statistics to RMSE/R2/AIC/BIC/Log.Lik./F.
msummary(modelsummary_lmer_full_uk, output = "tables/lmer_full_uk.docx",
             stars = TRUE, align = "lc",metrics=c("RMSE","R2","AIC", "BIC","Log.Lik.","F"))

3.2.3 Combined Sample

For this combined analysis, we first need to restrict the age range for the NAE sample (previously ±2 months, now ±0.5 months).

# Build the combined British + NAE dataset ==============
# Counts before the NAE age-range restriction (participants and CDI rows)
before_exclusion_participants <- data.total %>%
  filter(language_zone %in% c("NAE", "British")) %>%
  distinct(subid_unique) %>%
  count()
before_exclusion_CDIs <- data.total %>%
  filter(language_zone %in% c("NAE", "British")) %>%
  count()

# Narrow the NAE CDI age window by half a month on each side (converted to
# days), then drop CDI administrations that fall outside the window
data.uk_nae <- data.total %>%
  filter(language_zone %in% c("British", "NAE")) %>%
  mutate(
    CDI.agemin = ifelse(language_zone == "NAE",
      CDI.agemin + round(.5 * 365.25 / 12),
      CDI.agemin
    ),
    CDI.agemax = ifelse(language_zone == "NAE",
      CDI.agemax - round(.5 * 365.25 / 12),
      CDI.agemax
    )
  ) %>%
  filter(CDI.agedays >= CDI.agemin, CDI.agedays <= CDI.agemax) %>%
  droplevels()
# Sum-coded contrasts for the analysis factors
contrasts(data.uk_nae$gender) <- contr.sum(2)
contrasts(data.uk_nae$method) <- contr.sum(3)
contrasts(data.uk_nae$language_zone) <- contr.sum(2)

# Counts after the age-range restriction
after_exclusion_participants <- data.uk_nae %>%
  distinct(subid_unique) %>%
  count()
after_exclusion_CDIs <- data.uk_nae %>%
  count()

We go from 304 to 290 total participants in the combined sample, meaning that 14 participants were excluded from the North American sample. In total, 28 rows of data were removed.

We can then run the planned combined analysis adding the main effect and interactions of language_zone.

# Full combined (UK + NAE) model: CDI proportion predicted by CDI age,
# language zone, gender, test age, method, IDS preference, and the
# pre-registered IDS-preference interactions, with random intercepts for
# lab and participant (participants contribute repeated CDI measurements).
lmer.full.uk_nae <- lmer(CDI.prop ~ CDI.z_age_months + language_zone + gender +
  z_age_months + method + z.IDS_pref + z.IDS_pref:language_zone +
  z.IDS_pref:method + z.IDS_pref:CDI.z_age_months + z.IDS_pref:z_age_months +
  (1 | labid) + (1 | subid_unique),
data = data.uk_nae
)

summary(lmer.full.uk_nae)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: CDI.prop ~ CDI.z_age_months + language_zone + gender + z_age_months +  
##     method + z.IDS_pref + z.IDS_pref:language_zone + z.IDS_pref:method +  
##     z.IDS_pref:CDI.z_age_months + z.IDS_pref:z_age_months + (1 |  
##     labid) + (1 | subid_unique)
##    Data: data.uk_nae
## 
## REML criterion at convergence: -97.6
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.89103 -0.50634 -0.00619  0.44106  2.47651 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  subid_unique (Intercept) 0.020831 0.14433 
##  labid        (Intercept) 0.001502 0.03875 
##  Residual                 0.020069 0.14166 
## Number of obs: 397, groups:  subid_unique, 290; labid, 13
## 
## Fixed effects:
##                               Estimate Std. Error         df t value Pr(>|t|)
## (Intercept)                   0.323947   0.019276  11.257953  16.805 2.52e-09
## CDI.z_age_months              0.052436   0.002833 183.929733  18.506  < 2e-16
## language_zone1                0.096761   0.025176  11.529968   3.843  0.00251
## gender1                       0.029054   0.011460 269.840429   2.535  0.01180
## z_age_months                 -0.004151   0.003845 133.024167  -1.079  0.28233
## method1                      -0.014103   0.024982  17.331391  -0.565  0.57963
## method2                       0.001297   0.039288  15.954205   0.033  0.97407
## z.IDS_pref                    0.007971   0.040343 296.222646   0.198  0.84351
## language_zone1:z.IDS_pref     0.029449   0.058002 307.606650   0.508  0.61201
## method1:z.IDS_pref           -0.057186   0.052785 306.712106  -1.083  0.27949
## method2:z.IDS_pref            0.043003   0.087955 311.318982   0.489  0.62524
## CDI.z_age_months:z.IDS_pref   0.009680   0.008311 184.394430   1.165  0.24565
## z_age_months:z.IDS_pref      -0.004877   0.011250 274.134058  -0.434  0.66497
##                                
## (Intercept)                 ***
## CDI.z_age_months            ***
## language_zone1              ** 
## gender1                     *  
## z_age_months                   
## method1                        
## method2                        
## z.IDS_pref                     
## language_zone1:z.IDS_pref      
## method1:z.IDS_pref             
## method2:z.IDS_pref             
## CDI.z_age_months:z.IDS_pref    
## z_age_months:z.IDS_pref        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 13 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
# Type III ANOVA table for the combined model, keeping term names as a column
combined.full.uk_nae_pvalue <- lmer.full.uk_nae %>%
  anova() %>%
  as_tibble(rownames = "Parameter") # Type III p values (lmerTest/Satterthwaite)

# ==========
# Sequentially removed random effects:
# z.IDS_pref:z_age_months
# z.IDS_pref:CDI.z_age_months
# z.IDS_pref:method
# z.IDS_pref:language_zone
# z.IDS_pref
# method
# z_age_months
# gender
# language_zone
# CDI.z_age_months
# ==========

3.2.3.1 (Optional) Checking mixed-model assumptions

  1. Linearity
  2. Normality of the residuals
  3. Homoscedasticity of residuals
  4. No autocorrelation
  5. No multicollinearity
# First, check linearity. The plot looks linear.
# NOTE(review): residuals are plotted against the observed response
# (CDI.prop) rather than the fitted values; residuals correlate with the
# response by construction, so a residuals-vs-fitted() plot would be the more
# standard linearity check -- confirm this is intended.
data.uk_nae$resid <- residuals(lmer.full.uk_nae)

plot(data.uk_nae$resid, data.uk_nae$CDI.prop)

# Second, check normality of the residuals via sjPlot diagnostic plots
plot_model(lmer.full.uk_nae, type = "diag") ## we do have right-skewed normality of residuals
## [[1]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[2]]
## [[2]]$subid_unique
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[2]]$labid
## `geom_smooth()` using formula = 'y ~ x'

## 
## 
## [[3]]

## 
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'

# Third, check autocorrelation.
# nlme::ACF() requires an nlme::lme fit, so re-fit the same fixed-effect
# structure as lmer.full.uk_nae with a lab random intercept.
re_run_lme.full.uk_nae <- nlme::lme(CDI.prop ~ CDI.z_age_months + language_zone + gender +
  z_age_months + method + z.IDS_pref + z.IDS_pref:language_zone +
  z.IDS_pref:method + z.IDS_pref:CDI.z_age_months + z.IDS_pref:z_age_months,
random = ~ 1 | labid,
method = "REML",
data = data.uk_nae, na.action = na.exclude
)

# Autocorrelation function of the normalized residuals
plot(nlme::ACF(re_run_lme.full.uk_nae, resType = "normalized")) # there is no sign for autocorrelation

# Lastly, check multicollinearity via (generalized) variance inflation factors
car::vif(lmer.full.uk_nae) # all GVIF^(1/(2*Df)) < 2 (see output below), so no multicollinearity concern
##                                 GVIF Df GVIF^(1/(2*Df))
## CDI.z_age_months            1.030475  1        1.015123
## language_zone               1.998549  1        1.413701
## gender                      1.033569  1        1.016646
## z_age_months                1.150985  1        1.072840
## method                      2.245758  2        1.224167
## z.IDS_pref                  1.466093  1        1.210823
## language_zone:z.IDS_pref    3.033182  1        1.741603
## method:z.IDS_pref           3.378960  2        1.355800
## CDI.z_age_months:z.IDS_pref 1.050537  1        1.024957
## z_age_months:z.IDS_pref     1.201681  1        1.096212

We then compute \(p\)-values, but leave out power estimates for those \(p\)-values as above. Again, we have a lot of singular fit issues for the power checks and decided not to remove parameters based on posthoc power analysis.

# Post hoc power for CDI.z_age_months in the combined model (1000 simulations)
check_pwr_combined_cdi_age <- simr::powerSim(lmer.full.uk_nae, test = fixed("CDI.z_age_months", method = "z"), seed = 2, nsim = 1000, alpha = 0.05) # CDI.z_age_months is the fixed effect under test (original comment said "Gender" -- a copy-paste slip)

check_pwr_combined_cdi_age

# Post hoc power for language_zone in the combined model (1000 simulations)
check_pwr_combined_lang_zone <- simr::powerSim(lmer.full.uk_nae, test = fixed("language_zone", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) # language_zone is the fixed effect under test (original comment said "Gender" -- a copy-paste slip)

check_pwr_combined_lang_zone

# Post hoc power for gender in the combined model (1000 simulations)
# NOTE(review): seed = 3 is reused from the language_zone check above -- confirm this is intentional
check_pwr_combined_gender <- simr::powerSim(lmer.full.uk_nae, test = fixed("gender", method = "z"), seed = 3, nsim = 1000, alpha = 0.05) # specify that gender is the fixed effect that we are looking into

check_pwr_combined_gender

3.2.3.2 Table Summary

# Export the combined (UK + NAE) model as a Word table via modelsummary
modelsummary_lmer_full_uk_nae<- list(
  "Full Model" = lmer.full.uk_nae)

# NOTE(review): `metrics` is not a documented modelsummary/msummary argument
# (goodness-of-fit rows are selected with `gof_map`); verify the exported
# table actually restricts the fit statistics to RMSE/R2/AIC/BIC/Log.Lik./F.
msummary(modelsummary_lmer_full_uk_nae, output = "tables/lmer_full_uk_nae.docx",
             stars = TRUE, align = "lc",metrics=c("RMSE","R2","AIC", "BIC","Log.Lik.","F"))

3.2.4 Summary table all 3 models

# Side-by-side Word table for all three full models
# (lmer.full.nae is fit earlier in this script, outside this section)
modelsummary_all_three_models<- list(
  "NAE Full Model"= lmer.full.nae,
  "UK Full Model" = lmer.full.uk,
  "NAE+UK Full Model" = lmer.full.uk_nae)

# NOTE(review): `metrics` is not a documented modelsummary/msummary argument
# (goodness-of-fit rows are selected with `gof_map`); verify the exported
# table actually restricts the fit statistics to RMSE/R2/AIC/BIC/Log.Lik./F.
msummary(modelsummary_all_three_models, output = "tables/lmer_full_three_models.docx",
             stars = TRUE, align = "lccc",metrics=c("RMSE","R2","AIC", "BIC","Log.Lik.","F"))
## Loading required package: lmerTest
## Loading required package: lme4
## Loading required package: Matrix
## 
## Attaching package: ‘lmerTest’
## The following object is masked from ‘package:lme4’:
## 
##     lmer
## The following object is masked from ‘package:stats’:
## 
##     step
## Loading required package: lmerTest
## Loading required package: lme4
## Loading required package: Matrix
## 
## Attaching package: ‘lmerTest’
## The following object is masked from ‘package:lme4’:
## 
##     lmer
## The following object is masked from ‘package:stats’:
## 
##     step
## Loading required package: lmerTest
## Loading required package: lme4
## Loading required package: Matrix
## 
## Attaching package: ‘lmerTest’
## The following object is masked from ‘package:lme4’:
## 
##     lmer
## The following object is masked from ‘package:stats’:
## 
##     step